Data Inspection¶
In [1]:
from plotly.offline import init_notebook_mode

# Enable offline plotly rendering so figures display inside the notebook.
init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

# Read the ~13M-row CSV in 100k-row chunks (with a progress bar) and stitch
# the chunks back into a single frame. CANCELLATION_CODE is forced to str so
# mixed NaN/letter codes don't produce a mixed-dtype column.
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv",
    chunksize=100000,
    dtype={
        "CANCELLATION_CODE": str,
    },
)
df_list = [chunk for chunk in tqdm(chunk_iter)]
df = pd.concat(df_list)
df.head()
130it [00:29, 4.44it/s]
Out[2]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
Data Cleaning¶
In [4]:
import pandas as pd

# Rows with no actual departure or arrival time (e.g. cancelled flights)
# cannot be converted to timestamps below, so drop them up front.
df = df.dropna(subset=["DEP_TIME", "ARR_TIME"])
def convert_to_timestamp(date, start_time, end_time, return_time="start"):
    """Build a pd.Timestamp from a flight date and an HHMM-encoded clock time.

    Parameters
    ----------
    date : pd.Timestamp
        The flight date (midnight of the departure day).
    start_time, end_time : float
        Departure / arrival clock times encoded as HHMM (e.g. 1345.0 means
        13:45). A value of 2400 means midnight of the following day.
    return_time : str
        "start" to build the departure timestamp; any other value builds the
        arrival timestamp.

    Returns
    -------
    pd.Timestamp or None
        None when the raw values cannot be parsed (e.g. NaN, or out-of-range
        hour/minute); the offending pair is printed for inspection.

    Notes
    -----
    When the arrival clock time is earlier than the departure clock time the
    flight is assumed to have crossed midnight, so the arrival date is rolled
    forward by one day.
    """
    try:
        # Parsing happens INSIDE the try so malformed values (NaN times,
        # minutes > 59, hours > 24) are reported and skipped instead of
        # crashing the whole .apply. (Previously parsing ran before the try,
        # so a single NaN time raised an uncaught ValueError.)
        start_time_str = str(int(start_time)).zfill(4)
        start_hour = int(start_time_str[:2])
        start_minute = int(start_time_str[2:])
        end_time_str = str(int(end_time)).zfill(4)
        end_hour = int(end_time_str[:2])
        end_minute = int(end_time_str[2:])
        if return_time == "start":
            hour, minute = start_hour, start_minute
            # 2400 encodes midnight of the next day.
            if start_hour == 24:
                date = date + pd.DateOffset(days=1)
                hour = start_hour % 24
        else:
            hour, minute = end_hour, end_minute
            # An arrival clock earlier than departure implies an overnight
            # flight; 2400 again encodes next-day midnight.
            if end_time < start_time or end_hour == 24:
                date = date + pd.DateOffset(days=1)
                hour = end_hour % 24
        return pd.Timestamp(
            year=date.year,
            month=date.month,
            day=date.day,
            hour=hour,
            minute=minute,
        )
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so genuine programming errors are
        # no longer silently swallowed; bad rows are logged and skipped.
        print(start_time, end_time)
        return None
df["FL_DATE"] = pd.to_datetime(df["FL_DATE"])

# Derive wall-clock departure/arrival timestamps from the HHMM-encoded times.
for new_col, which in (("Start_Timestamp", "start"), ("End_Timestamp", "end")):
    df[new_col] = df.apply(
        lambda row, which=which: convert_to_timestamp(
            row["FL_DATE"], row["DEP_TIME"], row["ARR_TIME"], which
        ),
        axis=1,
    )
df.head()
Out[4]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | Start_Timestamp | End_Timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 10:25:00 | 2014-07-01 12:14:00 |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 06:21:00 | 2014-07-01 09:45:00 |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 12:38:00 | 2014-07-01 13:41:00 |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 09:04:00 | 2014-07-01 11:59:00 |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 20:16:00 | 2014-07-01 23:17:00 |
5 rows × 26 columns
In [5]:
df.columns
Out[5]:
Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_SEQ_ID',
'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_TIME', 'ARR_DELAY',
'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY',
'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'Start_Timestamp',
'End_Timestamp'],
dtype='object')
In [6]:
import matplotlib.pyplot as plt
from datetime import time

# Columns not needed by the visualizations below: raw identifiers, the raw
# HHMM times (superseded by the derived timestamps), and taxi durations.
UNUSED_COLUMNS = [
    "FL_DATE",
    "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEP_TIME",
    "ARR_TIME",
    "TAXI_OUT",
    "TAXI_IN",
]
target_df = df.drop(columns=UNUSED_COLUMNS)

# Free the memory held by the full frame; only target_df is used from here on.
del df
Visualization¶
Route¶
In [7]:
# Label each flight with an "Origin -> Destination" route string.
# Vectorized Series concatenation produces identical values to the original
# per-row .apply but avoids a Python-level pass over ~13M rows.
target_df["Route"] = (
    target_df["ORIGIN_CITY_NAME"] + " -> " + target_df["DEST_CITY_NAME"]
)
target_df.head()
Out[7]:
| ORIGIN | ORIGIN_CITY_NAME | DEST | DEST_CITY_NAME | DEP_DELAY | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | Start_Timestamp | End_Timestamp | Route | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CLT | Charlotte, NC | DFW | Dallas/Fort Worth, TX | -5.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 10:25:00 | 2014-07-01 12:14:00 | Charlotte, NC -> Dallas/Fort Worth, TX |
| 1 | DFW | Dallas/Fort Worth, TX | CLT | Charlotte, NC | -4.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 06:21:00 | 2014-07-01 09:45:00 | Dallas/Fort Worth, TX -> Charlotte, NC |
| 2 | ATL | Atlanta, GA | DFW | Dallas/Fort Worth, TX | -2.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 12:38:00 | 2014-07-01 13:41:00 | Atlanta, GA -> Dallas/Fort Worth, TX |
| 3 | DFW | Dallas/Fort Worth, TX | ATL | Atlanta, GA | 14.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 09:04:00 | 2014-07-01 11:59:00 | Dallas/Fort Worth, TX -> Atlanta, GA |
| 4 | DFW | Dallas/Fort Worth, TX | ATL | Atlanta, GA | 1.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2014-07-01 20:16:00 | 2014-07-01 23:17:00 | Dallas/Fort Worth, TX -> Atlanta, GA |
In [33]:
# Per-route summary: the median departure delay and how often the route is
# flown. (Dead commented-out experiments from earlier iterations removed.)
target_df_temp = target_df.groupby("Route")["DEP_DELAY"].agg(
    Median_Delay=lambda x: x.quantile(0.5),
    Route_Use="size",
).reset_index()
# 3654 ≈ the number of days covered by the 10-year dataset
# (mid-2014 .. end-2018 span suggested by the outputs) — TODO confirm exact span.
target_df_temp["Daily_Route_Use"] = target_df_temp.Route_Use / 3654
target_df_temp
Out[33]:
| Route | Median_Delay | Route_Use | Daily_Route_Use | |
|---|---|---|---|---|
| 0 | Atlanta, GA -> Charlotte, NC | -2.0 | 56939 | 15.582649 |
| 1 | Atlanta, GA -> Chicago, IL | -1.0 | 63768 | 17.451560 |
| 2 | Atlanta, GA -> Dallas, TX | -1.0 | 31748 | 8.688560 |
| 3 | Atlanta, GA -> Dallas/Fort Worth, TX | -1.0 | 63436 | 17.360701 |
| 4 | Atlanta, GA -> Denver, CO | -1.0 | 55613 | 15.219759 |
| ... | ... | ... | ... | ... |
| 499 | St. Louis, MO -> San Francisco, CA | 0.0 | 5637 | 1.542693 |
| 500 | St. Louis, MO -> San Jose, CA | 5.0 | 1399 | 0.382868 |
| 501 | St. Louis, MO -> San Juan, PR | 7.0 | 108 | 0.029557 |
| 502 | St. Louis, MO -> Santa Ana, CA | -1.0 | 724 | 0.198139 |
| 503 | St. Louis, MO -> Seattle, WA | -2.0 | 7261 | 1.987137 |
504 rows × 4 columns
In [37]:
import plotly.express as px

# One point per route: traffic volume against typical departure delay.
fig = px.scatter(
    target_df_temp,
    x="Median_Delay",
    y="Daily_Route_Use",
    title="Median Delay vs Daily Route Use",
)
fig.show()
In [12]:
# Attach each flight's route-level summary stats (Median_Delay, Route_Use,
# Daily_Route_Use) to its row.
# NOTE(review): despite the name, this holds flights for ALL routes present in
# target_df_temp (the inner merge keeps every matching route), not a top-5
# subset — consider renaming.
top5_delayed_route = pd.merge(
target_df,
target_df_temp,
how = "inner",
on = 'Route',
)
In [18]:
import plotly.express as px
import numpy as np

np.random.seed(1)
# Down-sample to keep the box plot responsive. pandas raises ValueError when
# asked for more rows than exist without replacement — catch exactly that
# (the original bare `except:` also hid unrelated errors).
try:
    top5_delayed_route = top5_delayed_route.sample(10000, replace=False)
except ValueError:
    print("Too Few Rows")
fig = px.box(
    top5_delayed_route.sort_values("DEP_DELAY", ascending=False),
    x="Route",
    y="DEP_DELAY",
    log_y=True,  # delays span several orders of magnitude
    # (removed: a color_discrete_map keyed on True/False — with no `color=`
    # column it was a no-op)
)
fig.update_layout(
    height=600,
    title_text="Route Departure Delay",
    showlegend=True,
)
fig.show()
Too Few Rows
for row in Each Row of Flight_Records:
Baseline = row.Arrival_Time - row.Departure_Time + row.Arrival_Delay
Optional_Flights = Filter(Flight_Records by Flight_Date +/- 1 Day and (Same Destination or Same Origin))
Optional_Flights = Optional_Flights JOIN Optional_Flights ON Mid_Point
Optional_Flights = Filter(Optional_Flights Transit_Time >= 60 min)
Optional_Flights.New_Time = Optional_Flights.Arrival_Time_2nd \
- Optional_Flights.Departure_Time_1st \
+ Optional_Flights.Arrival_Delay_2nd
Optional_Routes = Group Optional_Flights by Routes
Possible_Routes = Filter(Optional_Routes by New_Time.Median < Baseline)
return the Mean, Median, SD of Possible_Routes
In [52]:
from datetime import timedelta
from tqdm import tqdm
import numpy as np
import json
def improve(dep_time, arr_time, arr_delay, origin, dest, df):
    """Search for one-stop alternatives to a direct flight.

    Parameters
    ----------
    dep_time : pd.Timestamp
        Departure timestamp of the original direct flight.
    arr_time : pd.Timestamp
        Arrival timestamp of the original flight. Currently unused; kept for
        signature compatibility with the calling loop.
    arr_delay : float
        Arrival delay of the original flight, in minutes.
    origin, dest : str
        Origin / destination city names of the original flight.
    df : pd.DataFrame
        Flight table with Start_Timestamp, End_Timestamp, ARR_DELAY,
        ORIGIN_CITY_NAME, DEST_CITY_NAME and Route columns.

    Returns
    -------
    pd.DataFrame or None
        One row per first-leg route whose median second-leg arrival delay
        beats ``arr_delay``; None when no viable alternative exists.
    """
    # Candidate flights: departing within +/- 1 day of the original departure
    # and touching either the original origin or the original destination.
    window = df[
        (df["Start_Timestamp"] >= (dep_time - timedelta(days=1)))
        & (df["Start_Timestamp"] <= (dep_time + timedelta(days=1)))
    ]
    window = window[
        (window.ORIGIN_CITY_NAME == origin) | (window.DEST_CITY_NAME == dest)
    ]
    # Pair legs that share a connecting city (1st leg DEST == 2nd leg ORIGIN).
    route_suggest = pd.merge(
        window, window,
        how="inner", left_on="DEST_CITY_NAME", right_on="ORIGIN_CITY_NAME",
        suffixes=["_1st", "_2nd"],
    )
    route_suggest = route_suggest[
        (route_suggest["ORIGIN_CITY_NAME_1st"] == origin)
        & (route_suggest["DEST_CITY_NAME_2nd"] == dest)
    ]
    if len(route_suggest) == 0:
        return None
    # Require at least 60 minutes of transit time on top of the first leg's
    # arrival delay.
    transit_minutes = (
        route_suggest["Start_Timestamp_2nd"] - route_suggest["End_Timestamp_1st"]
    ).apply(lambda gap: gap.total_seconds() // 60)
    route_suggest = route_suggest[
        transit_minutes > (route_suggest["ARR_DELAY_1st"] + 60)
    ]
    if len(route_suggest) == 0:
        return None
    # The quality metric of an alternative is its second leg's arrival delay.
    # Direct column copy replaces the original per-row .apply, which computed
    # exactly the same values one row at a time.
    route_suggest["new_time"] = route_suggest["ARR_DELAY_2nd"]
    # Summarize per first-leg route and keep only routes whose median delay
    # beats the original flight's arrival delay.
    output = route_suggest.groupby(
        ["Route_1st"]
    )["new_time"].agg(["median", "mean", "std", "max", "min"]).reset_index()
    output.sort_values("median", inplace=True)
    output = output[output["median"] < arr_delay]
    if len(output) == 0:
        return None
    # Carry the original flight's identity so results can be joined back.
    output["Original_Departure_Time"] = dep_time
    output["Original_ORIGIN_CITY"] = origin
    output["Original_DEST_CITY"] = dest
    output["Original_Time_Taken"] = arr_delay
    return output
# Run the alternative-route search for every sampled flight and collect the
# non-empty suggestion frames.
routes_df_list = []
for _, flight in tqdm(top5_delayed_route.iterrows()):
    suggestion = improve(
        flight["Start_Timestamp"],
        flight["End_Timestamp"],
        flight["ARR_DELAY"],
        flight["ORIGIN_CITY_NAME"],
        flight["DEST_CITY_NAME"],
        target_df,
    )
    if suggestion is not None:
        routes_df_list.append(suggestion)
pd.concat(routes_df_list)
7092it [12:18, 9.61it/s]
Out[52]:
| Route_1st | median | mean | std | max | min | Original_Departure_Time | Original_ORIGIN_CITY | Original_DEST_CITY | Original_Time_Taken | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | Houston, TX -> Denver, CO | -14.0 | 11.777778 | 30.568684 | 44.0 | -14.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 6 | Houston, TX -> New Orleans, LA | -12.0 | -4.076923 | 18.895339 | 33.0 | -16.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 1 | Houston, TX -> Charlotte, NC | -11.0 | 2.076923 | 20.076776 | 33.0 | -17.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 10 | Houston, TX -> Raleigh/Durham, NC | -2.0 | 10.444444 | 43.090344 | 114.0 | -24.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 0 | Houston, TX -> Atlanta, GA | 3.0 | 6.031250 | 20.286025 | 56.0 | -14.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | Houston, TX -> Atlanta, GA | -12.0 | -10.020619 | 6.864734 | 3.0 | -21.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 10 | Houston, TX -> Raleigh/Durham, NC | -10.0 | -6.888889 | 12.910957 | 16.0 | -22.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 6 | Houston, TX -> Miami, FL | -10.0 | -5.818182 | 32.771384 | 79.0 | -36.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 3 | Houston, TX -> Denver, CO | -9.0 | -6.846154 | 14.006650 | 19.0 | -23.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 12 | Houston, TX -> Santa Ana, CA | -8.0 | -8.000000 | NaN | -8.0 | -8.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
44859 rows × 10 columns
In [53]:
alt_routes = pd.concat(routes_df_list)

# Attach the alternative-route summaries back onto the sampled flights;
# flights with no viable alternative keep NaN/NaT columns (left join).
merged_temp_df = top5_delayed_route.merge(
    alt_routes,
    how="left",
    left_on=["Start_Timestamp", "ORIGIN_CITY_NAME", "DEST_CITY_NAME"],
    right_on=["Original_Departure_Time", "Original_ORIGIN_CITY", "Original_DEST_CITY"],
)
merged_temp_df
Out[53]:
| ORIGIN | ORIGIN_CITY_NAME | DEST | DEST_CITY_NAME | DEP_DELAY | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | ... | Route_1st | median | mean | std | max | min | Original_Departure_Time | Original_ORIGIN_CITY | Original_DEST_CITY | Original_Time_Taken | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HOU | Houston, TX | JFK | New York, NY | -11.0 | -37.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN |
| 1 | HOU | Houston, TX | JFK | New York, NY | -8.0 | 27.0 | 0.0 | NaN | 0.0 | 0.0 | ... | Houston, TX -> Denver, CO | -14.0 | 11.777778 | 30.568684 | 44.0 | -14.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 2 | HOU | Houston, TX | JFK | New York, NY | -8.0 | 27.0 | 0.0 | NaN | 0.0 | 0.0 | ... | Houston, TX -> New Orleans, LA | -12.0 | -4.076923 | 18.895339 | 33.0 | -16.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 3 | HOU | Houston, TX | JFK | New York, NY | -8.0 | 27.0 | 0.0 | NaN | 0.0 | 0.0 | ... | Houston, TX -> Charlotte, NC | -11.0 | 2.076923 | 20.076776 | 33.0 | -17.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| 4 | HOU | Houston, TX | JFK | New York, NY | -8.0 | 27.0 | 0.0 | NaN | 0.0 | 0.0 | ... | Houston, TX -> Raleigh/Durham, NC | -2.0 | 10.444444 | 43.090344 | 114.0 | -24.0 | 2014-07-01 18:12:00 | Houston, TX | New York, NY | 27.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46522 | HOU | Houston, TX | JFK | New York, NY | 1.0 | -7.0 | 0.0 | NaN | NaN | NaN | ... | Houston, TX -> Miami, FL | -10.0 | -5.818182 | 32.771384 | 79.0 | -36.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 46523 | HOU | Houston, TX | JFK | New York, NY | 1.0 | -7.0 | 0.0 | NaN | NaN | NaN | ... | Houston, TX -> Denver, CO | -9.0 | -6.846154 | 14.006650 | 19.0 | -23.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 46524 | HOU | Houston, TX | JFK | New York, NY | 1.0 | -7.0 | 0.0 | NaN | NaN | NaN | ... | Houston, TX -> Santa Ana, CA | -8.0 | -8.000000 | NaN | -8.0 | -8.0 | 2018-12-31 11:39:00 | Houston, TX | New York, NY | -7.0 |
| 46525 | JFK | New York, NY | SMF | Sacramento, CA | 0.0 | -23.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN |
| 46526 | EWR | Newark, NJ | SMF | Sacramento, CA | 0.0 | -30.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN |
46527 rows × 27 columns
In [66]:
# Per original route: central tendency of the alternatives' median arrival
# delays, plus the share of sampled flights for which at least one
# alternative route was found.
out = merged_temp_df.groupby(['Route']).agg({
# 'median' is the per-flight median alternative delay produced by improve();
# summarize it with its own median and mean (dict-of-tuples agg yields
# MultiIndex columns, flattened in the next cell).
"median" : ('median', 'mean'),
# Percentage of rows with a non-null alternative ("reliability"): the sum of
# (not-null indicator / row count * 100) over the group.
'Route_1st' : lambda x: sum(~x.isna()/len(x)*100)
}).reset_index()
In [68]:
# Flatten the MultiIndex columns produced by the dict-style .agg above into
# human-readable names (order must match the agg spec: Route key, median's
# median, median's mean, Route_1st reliability).
out.columns = ['Route', 'Alternative Arr Delay Median', 'Alternative Arr Delay Mean', 'Example Route Reliability (%)']
out
Out[68]:
| Route | Alternative Arr Delay Median | Alternative Arr Delay Mean | Example Route Reliability (%) | |
|---|---|---|---|---|
| 0 | Houston, TX -> New York, NY | -9.5 | -7.283299 | 97.255575 |
| 1 | Miami, FL -> Portland, OR | -9.0 | -8.605536 | 99.029126 |
| 2 | New York, NY -> Sacramento, CA | -5.0 | -4.071500 | 94.410786 |
| 3 | Newark, NJ -> Sacramento, CA | -5.0 | -3.935500 | 97.435492 |
| 4 | San Juan, PR -> New Orleans, LA | -5.0 | -2.579310 | 99.315068 |
In [ ]: